This Rmarkdown file assesses the output of CheckV, DeepVirFinder, Kaiju, VIBRANT, VirSorter, and VirSorter2 on multiple training sets of microbial DNA, primarily from NCBI. Created from fungal, viral, bacterial, archeael, protist, and plasmid DNA sequences
Please reach out to James Riddell (riddell.26@buckeyemail.osu.edu) or Bridget Hegarty (beh53@case.edu) regarding any issues, or open an issue on github.
library(ggplot2)
There were 30 warnings (use warnings() to see them)
library(plyr)
library(reshape2)
library(viridis)
library(tidyr)
library(dplyr)
library(readr)
library(data.table)
library(pROC)
Type 'citation("pROC")' for a citation.
Attaching package: ‘pROC’
The following objects are masked from ‘package:stats’:
cov, smooth, var
library("stringr")
Import the file that combines the results from each of the tools from running “combining_tool_output.Rmd”:
viruses <- read_tsv("../IntermediaryFiles/viral_tools_combined.tsv")
── Column specification ─────────────────────────────────────────────────────────────────────────
cols(
.default = col_double(),
seqtype = col_character(),
contig = col_character(),
checkv_provirus = col_character(),
checkv_quality = col_character(),
method.x = col_character(),
Classified = col_character(),
IDs_all = col_character(),
Seq = col_character(),
Kaiju_Viral = col_character(),
Kingdom = col_character(),
type = col_character(),
vibrant_quality = col_character(),
method.y = col_character(),
vibrant_prophage = col_character(),
vs2type = col_character(),
max_score_group = col_character(),
provirus = col_logical()
)
ℹ Use `spec()` for the full column specifications.
There were 50 or more warnings (use warnings() to see the first 50)
This section defines a viralness score “keep_score” based on the tool classifications. A final keep_score above 1 indicates we will keep that sequence and call it viral.
VIBRANT Quality == “High Quality Draft”: +1 Quality == “Medium Quality Draft”: +1 Quality == “Low Quality Draft” & provirus: +0.5
Virsorter2 Viral >= 50: +0.5 Viral >= 0.95: +0.5 RNA >= 0.9: +1 lavidaviridae >= 0.9: +1 NCLDV >= 0.9: +1
Virsorter category == 1,4: +1 category == 2,5: +0.5
DeepVirFinder: Score >= 0.7: +0.5
Tuning - No Viral Signature: Kaiju_viral = “cellular organisms”: -0.5 If host_genes >50 and NOT provirus: -1 If viral_genes == 0 and host_genes >= 1: -1 If 3*viral_genes <= host_genes and NOT provirus: -1 If length > 50,000 and hallmark <=1: -1 If length < 5000 and checkv completeness <= 75: -0.5
Tuning - Viral Signature: Kaiju_viral = “Viruses”: +0.5 If %unknown >= 75 and length < 50000: +0.5 If %viral >= 50: +0.5 Hallmark > 2: +0.5
This script produces visualizations of these combined viral scorings and includes ecological metrics like alpha diversity.
You can decide which combination is appropriate for them and only need use the tools appropriate for your data.
getting_viral_set_1 <- function(input_seqs,
There were 50 or more warnings (use warnings() to see the first 50)
include_vibrant=FALSE,
include_virsorter2=FALSE,
include_deepvirfinder=FALSE,
include_tuning_viral=FALSE,
include_tuning_not_viral=FALSE,
include_virsorter=FALSE) {
keep_score <- rep(0, nrow(input_seqs))
if (include_vibrant) {
keep_score[input_seqs$vibrant_quality=="high quality draft"] <- keep_score[input_seqs$vibrant_quality=="high quality draft"] + 1
keep_score[input_seqs$vibrant_quality=="medium quality draft"] <- keep_score[input_seqs$vibrant_quality=="medium quality draft"] + 1
keep_score[input_seqs$vibrant_quality=="low quality draft" & input_seqs$provirus] <- keep_score[input_seqs$vibrant_quality=="low quality draft" & input_seqs$provirus] + 0.5
}
if (include_virsorter2) {
#keep_score[input_seqs$viral>=50 | input_seqs$lavidaviridae>=0.95 | input_seqs$NCLDV>=0.95] <- keep_score[input_seqs$viral>=50 | input_seqs$lavidaviridae>=0.95 | input_seqs$NCLDV>=0.95] + 0.5
keep_score[input_seqs$max_score>=50] <- keep_score[input_seqs$max_score>=50] + 0.5
keep_score[input_seqs$max_score>=95] <- keep_score[input_seqs$max_score>=95] + 0.5
#keep_score[input_seqs$RNA>=0.95] <- keep_score[input_seqs$RNA>=0.95] + 1
}
if (include_virsorter) {
keep_score[input_seqs$category==1] <- keep_score[input_seqs$category==1] + 1
keep_score[input_seqs$category==2] <- keep_score[input_seqs$category==2] + 0.5
keep_score[input_seqs$category==4] <- keep_score[input_seqs$category==4] + 1
keep_score[input_seqs$category==5] <- keep_score[input_seqs$category==5] + 0.5
}
if (include_deepvirfinder) {
keep_score[input_seqs$score>=0.7 & input_seqs$checkv_length<20000] <- keep_score[input_seqs$score>=0.7 & input_seqs$checkv_length<20000] + 0.5
keep_score[input_seqs$score>=0.9 & input_seqs$checkv_length<20000] <- keep_score[input_seqs$score>=0.9 & input_seqs$checkv_length<20000] + 0.5
}
if (include_tuning_viral) {
keep_score[input_seqs$Kaiju_Viral=="Viruses"] <- keep_score[input_seqs$Kaiju_Viral=="Viruses"] + 0.5
keep_score[input_seqs$hallmark>2] <- keep_score[input_seqs$hallmark>2] + 0.5
keep_score[input_seqs$percent_unknown>=75 & input_seqs$checkv_length<50000] <- keep_score[input_seqs$percent_unknown>=75 & input_seqs$checkv_length<50000] + 0.5
keep_score[input_seqs$viral>=50] <- keep_score[input_seqs$viral>=50] + 0.5
}
if (include_tuning_not_viral) {
keep_score[input_seqs$Kaiju_Viral=="cellular organisms"] <- keep_score[input_seqs$Kaiju_Viral=="cellular organisms"] - 0.5
keep_score[input_seqs$checkv_host_genes>50 & !input_seqs$provirus] <- keep_score[input_seqs$checkv_host_genes>50 & !input_seqs$provirus] - 1
keep_score[input_seqs$checkv_viral_genes==0 & input_seqs$checkv_host_genes>=1] <- keep_score[input_seqs$checkv_viral_genes==0 & input_seqs$checkv_host_genes>=1] - 1
keep_score[((input_seqs$checkv_viral_genes*3) <= input_seqs$checkv_host_genes) & !input_seqs$provirus] <- keep_score[((input_seqs$checkv_viral_genes*3) <= input_seqs$checkv_host_genes) & !input_seqs$provirus] - 1
keep_score[input_seqs$checkv_length>500000 & input_seqs$hallmark<=1] <- keep_score[input_seqs$checkv_length>500000 & input_seqs$hallmark<=1] - 1
keep_score[(input_seqs$checkv_completeness<=75 | input_seqs$vibrant_quality=="complete circular")& input_seqs$checkv_length<=5000] <- keep_score[input_seqs$checkv_completeness<=75 & input_seqs$checkv_length<=5000] - 0.5
}
return(keep_score)
}
note that this is only as accurate as the annotations of the input sequences
this function calculates the precision, recall, and F1 score for each pipeline
assess_performance <- function(seqtype, keep_score) {
truepositive <- rep("not viral", length(seqtype))
truepositive[seqtype=="virus"] <- "viral"
#make confusion matrix
confusion_matrix <- rep("true negative", length(keep_score))
confusion_matrix[truepositive=="viral" & keep_score<=1] <- "false negative"
confusion_matrix[truepositive=="viral" & keep_score>=1] <- "true positive"
confusion_matrix[truepositive=="not viral" & keep_score>=1] <- "false positive"
TP <- table(confusion_matrix)[4]
FP <- table(confusion_matrix)[2]
TN <- table(confusion_matrix)[3]
FN <- table(confusion_matrix)[1]
precision <- TP/(TP+FP)
recall <- TP/(TP+FN)
F1 <- 2*precision*recall/(precision+recall)
MCC <- (TP*TN-FP*FN)/sqrt(as.numeric(TP+FP)*as.numeric(TP+FN)*as.numeric(TN+FP)*as.numeric(TN+FN))
auc <- round(auc(truepositive, keep_score),4)
#by type metrics
fungal_FP <- table(confusion_matrix[seqtype=="fungi"])[2]
protist_FP <- table(confusion_matrix[seqtype=="protist"])[2]
bacterial_FP <- table(confusion_matrix[seqtype=="bacteria"])[2]
viral_FN <- table(confusion_matrix[seqtype=="virus"])[1]
performance <- c(precision, recall, F1, MCC, auc, fungal_FP,
protist_FP, bacterial_FP, viral_FN)
names(performance) <- c("precision", "recall", "F1", "MCC", "AUC", "fungal_FP",
"protist_FP", "bacterial_FP", "viral_FN")
return(performance)
}
combination of tools list
combos_list <- data.frame(toolcombo=rep(0, 64),
There were 27 warnings (use warnings() to see them)
tune_not_viral=rep(0, 64),
DVF=rep(0, 64),
tune_viral=rep(0, 64),
VIBRANT=rep(0, 64),
VS=rep(0, 64),
VS2=rep(0, 64))
p <- 1
for (i in c(0,1)){
for (j in c(0,1)){
for (k in c(0,1)){
for (l in c(0,1)){
for (m in c(0,1)){
for (n in c(0,1)){
combos_list$toolcombo[p] <- paste(i,j,k,l,m,n)
combos_list$toolcombo2[p] <- paste(if(i){"tv"}else{"0"},if(j){"DVF"}else{"0"},
if(k){"tnv"}else{"0"},if(l){"VB"}else{"0"},
if(m){"VS"}else{"0"},if(n){"VS2"}else{"0"})
combos_list$tune_not_viral[p] <- i
combos_list$DVF[p] <- j
combos_list$tune_viral[p] <- k
combos_list$VIBRANT[p] <- l
combos_list$VS[p] <- m
combos_list$VS2[p] <- n
p <- p+1
}
}
}
}
}
}
combos_list <- combos_list[-1,]
this function builds a list of all of the combinations that the user wants to test. In this case, we’re comparing the performance of all unique combinations of the six tools.
build_score_list <- function(input_seqs, combos) {
output <- data.frame(precision=rep(0, nrow(combos)),
recall=rep(0, nrow(combos)),
F1=rep(0, nrow(combos)),
MCC=rep(0, nrow(combos)),
AUC=rep(0, nrow(combos)),
fungal_FP=rep(0, nrow(combos)),
protist_FP=rep(0, nrow(combos)),
bacterial_FP=rep(0, nrow(combos)),
viral_FN=rep(0, nrow(combos)))
for (i in 1:nrow(combos)) {
keep_score <- getting_viral_set_1(input_seqs, include_vibrant = combos$VIBRANT[i],
include_virsorter = combos$VS[i],
include_virsorter2 = combos$VS2[i],
include_tuning_viral = combos$tune_viral[i],
include_tuning_not_viral = combos$tune_not_viral[i],
include_deepvirfinder = combos$DVF[i])
output[i,1:9] <- assess_performance(input_seqs$seqtype, keep_score)
output$toolcombo[i] <- paste(combos$tune_viral[i],combos$DVF[i],
combos$tune_not_viral[i], combos$VIBRANT[i],
combos$VS[i], combos$VS2[i])
}
output[is.na(output)] <- 0
return (output)
}
accuracy_scores <- data.frame(testing_set_index=rep(0, nrow(combos_list)*10),
precision=rep(0, nrow(combos_list)*10),
recall=rep(0, nrow(combos_list)*10),
F1=rep(0, nrow(combos_list)*10),
MCC=rep(0, nrow(combos_list)*10),
AUC=rep(0, nrow(combos_list)*10),
fungal_FP=rep(0, nrow(combos_list)*10),
protist_FP=rep(0, nrow(combos_list)*10),
bacterial_FP=rep(0, nrow(combos_list)*10),
viral_FN=rep(0, nrow(combos_list)*10))
accuracy_scores <- cbind(testing_set_index=rep(1, nrow(combos_list)),
build_score_list(viruses[viruses$Index==1,], combos_list))
for (i in 2:10) {
accuracy_scores <- rbind(accuracy_scores,
cbind(testing_set_index=rep(i, nrow(combos_list)),
build_score_list(viruses[viruses$Index==i,], combos_list)))
}
accuracy_scores$numrules <- str_count(accuracy_scores$toolcombo, "1")
There were 50 or more warnings (use warnings() to see the first 50)
#accuracy_scores <- accuracy_scores[order(accuracy_scores$numrules, decreasing=F),]
accuracy_scores <- accuracy_scores[order(accuracy_scores$MCC, decreasing=F),]
accuracy_scores$toolcombo <- factor(accuracy_scores$toolcombo, levels = unique(accuracy_scores$toolcombo))
accuracy_scores$numrules <- as.factor(accuracy_scores$numrules)
pal <- ggthemes::tableau_color_pal(palette="Tableau 10", type="regular")
There were 27 warnings (use warnings() to see them)
p2 <- ggplot(accuracy_scores, aes(x=toolcombo, y=F1,
color=numrules, fill=numrules)) +
geom_point(alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Tool Combination (tv, DVF, tnv, VB, VS, VS2)") +
ylab("F1 Score")
p2
ggplot(accuracy_scores, aes(x=toolcombo, y=precision,
color=numrules, fill=numrules)) +
geom_point(alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Tool Combination (tv, DVF, tnv, VB, VS, VS2)") +
ylab("Precision")
ggplot(accuracy_scores, aes(x=toolcombo, y=recall,
color=numrules, fill=numrules)) +
geom_point(alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Tool Combination (tv, DVF, tnv, VB, VS, VS2)") +
ylab("Recall")
ggplot(accuracy_scores, aes(x=precision, y=recall,
color=numrules, fill=numrules)) +
geom_point(alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=20),
) +
xlab("Precision") +
ylab("Recall") +
scale_fill_manual(name="",
values = alpha(rev(pal(6)), 0.5)) +
scale_color_manual(name="",
values = alpha(rev(pal(6)), 1))
ggplot(accuracy_scores[accuracy_scores$testing_set_index==1,], aes(x=precision, y=recall)) +
geom_label(alpha=0.7, label=accuracy_scores$toolcombo[accuracy_scores$testing_set_index==1]) +
geom_point(alpha=0.5, aes(color=numrules, fill=numrules)) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Precision") +
ylab("Recall")
ggplot(accuracy_scores, aes(x=toolcombo, y=abs(precision-recall),
color=numrules, fill=numrules)) +
geom_point(alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Tool Combination (tv, DVF, tnv, VB, VS, VS2)") +
ylab("Precision-Recall")
ggplot(accuracy_scores, aes(x=toolcombo, y=MCC,
color=numrules, fill=numrules)) +
geom_point(alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=20),
) +
xlab("Tool Combination (tv, DVF, tnv, VB, VS, VS2)") +
ylab("MCC") +
scale_fill_manual(name="Number of Rule Sets",
values = alpha(rev(pal(6)), 0.5)) +
scale_color_manual(name="Number of Rule Sets",
values = alpha(rev(pal(6)), 1))
ggplot(accuracy_scores, aes(x=toolcombo, y=AUC,
color=numrules, fill=numrules)) +
geom_point(alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Tool Combination (tv, DVF, tnv, VB, VS, VS2)") +
ylab("AUC")
ggplot(accuracy_scores, aes(x=toolcombo, y=fungal_FP,
color=numrules, fill=numrules)) +
geom_point(alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Tool Combination (tv, DVF, tnv, VB, VS, VS2)") +
ylab("Fungal False Positives")
ggplot(accuracy_scores, aes(x=toolcombo, y=protist_FP,
color=numrules, fill=numrules)) +
geom_point(alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Tool Combination (tv, DVF, tnv, VB, VS, VS2)") +
ylab("Protist False Positives")
ggplot(accuracy_scores, aes(x=toolcombo, y=bacterial_FP,
color=numrules, fill=numrules)) +
geom_point(alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Tool Combination (tv, DVF, tnv, VB, VS, VS2)") +
ylab("Bacterial False Positives")
ggplot(accuracy_scores, aes(x=toolcombo, y=viral_FN,
color=numrules, fill=numrules)) +
geom_point(alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Tool Combination (tv, DVF, tnv, VB, VS, VS2)") +
ylab("Viral False Negatives")
ggplot(accuracy_scores, aes(x=toolcombo, y=viral_FN,
color=numrules, fill=numrules)) +
geom_point(alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Tool Combination (tv, DVF, tnv, VB, VS, VS2)") +
ylab("Viral False Negatives")
ggplot(accuracy_scores, aes(x=protist_FP, y=fungal_FP,
color=numrules, fill=numrules)) +
geom_point(alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Protist FP") +
ylab("Fungal FP")
ggplot(accuracy_scores, aes(x=recall, y=fungal_FP,
color=numrules, fill=numrules)) +
geom_point(alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Recall") +
ylab("Fungal FP")
ggplot(accuracy_scores, aes(x=recall, y=protist_FP,
color=numrules, fill=numrules)) +
geom_point(alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Recall") +
ylab("Protist FP")
accuracy_scores_melt <- accuracy_scores %>%
There were 50 or more warnings (use warnings() to see the first 50)
select(testing_set_index, precision, recall, MCC, numrules, toolcombo) %>%
pivot_longer(cols=c(precision, recall, MCC),
names_to="performance_metric",
values_to="performance_metric_score")
ggplot(accuracy_scores_melt[accuracy_scores_melt$performance_metric=="MCC",], aes(x=numrules, y=performance_metric_score,
There were 30 warnings (use warnings() to see them)
color=numrules, fill=numrules)) +
geom_boxplot() +
geom_point(alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
ylab("MCC") +
xlab("Number of Rule Sets") +
scale_fill_manual(name="",
values = alpha(rev(pal(6)), 0.5)) +
scale_color_manual(name="",
values = alpha(rev(pal(6)), 1))
NA
ggplot(accuracy_scores_melt[accuracy_scores_melt$performance_metric=="recall",], aes(x=numrules, y=performance_metric_score,
There were 30 warnings (use warnings() to see them)
color=numrules, fill=numrules)) +
geom_boxplot() +
geom_point(alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
ylab("recall") +
xlab("Number of Rule Sets") +
scale_fill_manual(name="",
values = alpha(rev(pal(6)), 0.5)) +
scale_color_manual(name="",
values = alpha(rev(pal(6)), 1))
NA
ggplot(accuracy_scores_melt[accuracy_scores_melt$performance_metric!="MCC",], aes(x=numrules, y=performance_metric_score,
color=numrules, fill=numrules)) +
geom_boxplot() +
geom_point(alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
ylab("Score") +
xlab("Number of Rule Sets") +
scale_fill_manual(name="",
values = alpha(rev(pal(6)), 0.5)) +
scale_color_manual(name="",
values = alpha(rev(pal(6)), 1)) +
facet_wrap(~performance_metric)
NA
comparing metric with and without tuning rules
accuracy_scores_melt$tuning_inc <- "no"
There were 50 or more warnings (use warnings() to see the first 50)
accuracy_scores_melt$tuning_inc[substring(accuracy_scores_melt$toolcombo, 1, 1)==1] <- "tv"
accuracy_scores_melt$tuning_inc[substring(accuracy_scores_melt$toolcombo, 3, 3)==1] <- "tnv"
accuracy_scores_melt$tuning_inc[substring(accuracy_scores_melt$toolcombo, 1, 1)==1 &
substring(accuracy_scores_melt$toolcombo, 3, 3)==1] <- "both"
ggplot(accuracy_scores_melt, aes(x=tuning_inc, y=performance_metric_score)) +
geom_boxplot() +
geom_point(aes(color=numrules, fill=numrules), alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
ylab("MCC") +
xlab("Number of Tools") +
scale_fill_manual(name="",
values = alpha(rev(pal(6)), 0.5)) +
scale_color_manual(name="",
values = alpha(rev(pal(6)), 1)) +
facet_wrap(~performance_metric)
ggplot(accuracy_scores_melt, aes(x=tuning_inc, y=performance_metric_score,
color=numrules, fill=numrules)) +
geom_boxplot() +
geom_point(alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
ylab("MCC") +
xlab("Number of Tools") +
scale_fill_manual(name="",
values = alpha(rev(pal(6)), 0.5)) +
scale_color_manual(name="",
values = alpha(rev(pal(6)), 1)) +
facet_wrap(~performance_metric)
ggplot(accuracy_scores_melt[accuracy_scores_melt$performance_metric!="MCC",], aes(x=tuning_inc, y=performance_metric_score)) +
geom_boxplot() +
geom_boxplot(aes(color=numrules, fill=numrules)) +
# geom_point(aes(color=numrules, fill=numrules), alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
ylab("Score") +
xlab("") +
scale_fill_manual(name="",
values = alpha(rev(pal(6)), 0.3)) +
scale_color_manual(name="",
values = alpha(rev(pal(6)), 0.7)) +
facet_wrap(~performance_metric)
NA
write_tsv(accuracy_scores, "20221029_accuracy_scores.tsv")
There were 50 or more warnings (use warnings() to see the first 50)
to do: add in clustering and ordination like in the drinking water R notebook
viruses$keep_score_high_precision <- getting_viral_set_1(viruses, include_deepvirfinder = T,
There were 20 warnings (use warnings() to see them)
include_vibrant = T,
include_virsorter2 = F,
include_tuning_viral = F,
include_tuning_not_viral = T,
include_virsorter = F)
viruses$confusion_matrix_high_precision <- "true negative"
There were 25 warnings (use warnings() to see them)
viruses$confusion_matrix_high_precision[viruses$seqtype=="virus" & viruses$keep_score_high_precision<1] <- "false negative"
viruses$confusion_matrix_high_precision[viruses$seqtype=="virus" & viruses$keep_score_high_precision>=1] <- "true positive"
viruses$confusion_matrix_high_precision[viruses$seqtype!="virus" & viruses$keep_score_high_precision>=1] <- "false positive"
visualizing confusion matrix by taxa
confusion_by_taxa <- melt(table(viruses$confusion_matrix_high_precision, viruses$seqtype, viruses$Index))
The melt generic in data.table has been passed a table and will attempt to redirect to the relevant reshape2 method; please note that reshape2 is deprecated, and this redirection is now deprecated as well. To continue using melt methods from reshape2 while both libraries are attached, e.g. melt.list, you can prepend the namespace like reshape2::melt(table(viruses$confusion_matrix_high_precision, viruses$seqtype, viruses$Index)). In the next version, this warning will become an error.
colnames(confusion_by_taxa) <- c("confusion_matrix", "seqtype","Index", "count")
length(grep("true", viruses$confusion_matrix_high_precision))/nrow(viruses)
[1] 0.9206364
pal <- ggthemes::tableau_color_pal(palette="Tableau 10", type="regular")
There were 44 warnings (use warnings() to see them)
ggplot(confusion_by_taxa, aes(y=count, x=confusion_matrix,
There were 44 warnings (use warnings() to see them)
fill=confusion_matrix,
color=confusion_matrix)) +
geom_boxplot() +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
scale_fill_manual(name="",
values = alpha(rev(pal(4)), 0.5),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
scale_color_manual(name="",
values = alpha(rev(pal(4)), 1),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
xlab("Number of Sequences") +
ylab("") +
facet_wrap(~seqtype, scales = "free")
this rule set had the highest precision, but as you can see, this comes with a big sacrifice in recall
viruses$keep_score_high_MCC <- getting_viral_set_1(viruses, include_deepvirfinder = F,
There were 50 or more warnings (use warnings() to see the first 50)
include_vibrant = T,
include_virsorter2 = T,
include_tuning_viral = T,
include_tuning_not_viral = T,
include_virsorter = T)
viruses$confusion_matrix_high_MCC <- "true negative"
viruses$confusion_matrix_high_MCC[viruses$seqtype=="virus" & viruses$keep_score_high_MCC<1] <- "false negative"
viruses$confusion_matrix_high_MCC[viruses$seqtype=="virus" & viruses$keep_score_high_MCC>=1] <- "true positive"
viruses$confusion_matrix_high_MCC[viruses$seqtype!="virus" & viruses$keep_score_high_MCC>=1] <- "false positive"
accuracy:
length(grep("true", viruses$confusion_matrix_high_MCC))/nrow(viruses)
[1] 0.935778
recall
length(grep("true positive", viruses$confusion_matrix_high_MCC))/length(grep("virus", viruses$seqtype))
[1] 0.7078
There were 24 warnings (use warnings() to see them)
TP <- table(viruses$confusion_matrix_high_MCC)[4]
FP <- table(viruses$confusion_matrix_high_MCC)[2]
TN <- table(viruses$confusion_matrix_high_MCC)[3]
FN <- table(viruses$confusion_matrix_high_MCC)[1]
precision <- as.numeric(TP/(TP+FP))
precision
[1] 0.6845261
recall <- as.numeric(TP/(TP+FN))
recall
[1] 0.7078
F1 <- as.numeric(2*precision*recall/(precision+recall))
F1
[1] 0.6959685
MCC <- as.numeric((TP*TN-FP*FN)/sqrt(as.numeric(TP+FP)*as.numeric(TP+FN)*as.numeric(TN+FP)*as.numeric(TN+FN)))
MCC
[1] 0.6601921
precision=69%, recall=87%, MCC=77%
precision adjusting size to be equal viral/not viral
TP <- table(viruses$confusion_matrix_high_MCC)[4]
There were 48 warnings (use warnings() to see them)
FP <- table(viruses$confusion_matrix_high_MCC)[2]*.11
TN <- table(viruses$confusion_matrix_high_MCC)[3]*.11
FN <- table(viruses$confusion_matrix_high_MCC)[1]
precision <- as.numeric(TP/(TP+FP))
precision
[1] 0.9561163
recall <- as.numeric(TP/(TP+FN))
recall
[1] 0.7523
F1 <- as.numeric(2*precision*recall/(precision+recall))
F1
[1] 0.8420504
MCC <- as.numeric((TP*TN-FP*FN)/sqrt(as.numeric(TP+FP)*as.numeric(TP+FN)*as.numeric(TN+FP)*as.numeric(TN+FN)))
MCC
[1] 0.7293445
precision=0.95, recall=0.87, F1=0.91, MCC=0.82
visualizing confusion matrix by taxa
confusion_by_taxa <- viruses %>% count(confusion_matrix_high_MCC, seqtype, Index)
There were 24 warnings (use warnings() to see them)
colnames(confusion_by_taxa) <- c("confusion_matrix", "seqtype","index", "count")
pal <- ggthemes::tableau_color_pal(palette="Tableau 10", type="regular")
ggplot(confusion_by_taxa, aes(x=count, y=as.factor(index),
There were 24 warnings (use warnings() to see them)
fill=confusion_matrix,
color=confusion_matrix)) +
geom_bar(stat="identity") +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
scale_fill_manual(name="",
values = alpha(rev(pal(4)), 0.5),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
scale_color_manual(name="",
values = alpha(rev(pal(4)), 1),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
xlab("Number of Sequences") +
ylab("") +
facet_wrap(~seqtype, scales = "free") +
coord_flip()
differences based on genome size
viruses$size_class <- "3-5kb"
viruses$size_class[viruses$checkv_length>5000] <- "5-10kb"
viruses$size_class[viruses$checkv_length>10000] <- ">10kb"
confusion_by_taxa <- viruses %>% count(confusion_matrix_high_MCC, seqtype, size_class, Index)
colnames(confusion_by_taxa) <- c("confusion_matrix", "seqtype","size", "index", "count")
confusion_vir_called <- confusion_by_taxa %>% filter(confusion_matrix=="true positive" | confusion_matrix=="false positive")
There were 20 warnings (use warnings() to see them)
type_count <- viruses %>% count(seqtype, size_class, Index)
confusion_vir_called$per_viral <- 0
for (i in c(1:nrow(confusion_vir_called))) {
confusion_vir_called$per_viral[i] <- confusion_vir_called$count[i]/type_count$n[type_count$seqtype==confusion_vir_called$seqtype[i] &
type_count$Index==confusion_vir_called$index[i] &
type_count$size_class==confusion_vir_called$size[i]]*100
}
confusion_vir_called <- confusion_vir_called %>% group_by(seqtype, size) %>%
summarise(mean=mean(per_viral),
sd=sd(per_viral))
`summarise()` has grouped output by 'seqtype'. You can override using the `.groups` argument.
confusion_vir_called$size <- factor(confusion_vir_called$size,
levels = c("3-5kb", "5-10kb", ">10kb"))
ggplot(confusion_vir_called, aes(y=mean, x=size,
fill=seqtype,
color=seqtype)) +
geom_bar(stat="identity", position=position_dodge()) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
geom_errorbar(aes(ymin=mean-sd, ymax=mean+sd), width=.2,
position=position_dodge(.9)) +
scale_fill_manual(name="",
values = alpha(rev(pal(6)), 0.5)) +
scale_color_manual(name="",
values = alpha(rev(pal(6)), 1)) +
xlab("Length") +
ylab("Sequences Called Viral (%)")
viruses$keep_score_vb <- getting_viral_set_1(viruses, include_deepvirfinder = F,
There were 23 warnings (use warnings() to see them)
include_vibrant = T,
include_virsorter2 = F,
include_tuning_viral = F,
include_tuning_not_viral = F,
include_virsorter = F)
viruses$keep_score_vb_dvf <- getting_viral_set_1(viruses, include_deepvirfinder = T,
include_vibrant = T,
include_virsorter2 = F,
include_tuning_viral = F,
include_tuning_not_viral = F,
include_virsorter = F)
viruses$keep_score_vb_dvf_vs2 <- getting_viral_set_1(viruses, include_deepvirfinder = T,
include_vibrant = T,
include_virsorter2 = T,
include_tuning_viral = F,
include_tuning_not_viral = F,
include_virsorter = F)
viruses$keep_score_vb_dvf_vs2_vs <- getting_viral_set_1(viruses, include_deepvirfinder = T,
include_vibrant = T,
include_virsorter2 = T,
include_tuning_viral = F,
include_tuning_not_viral = F,
include_virsorter = T)
viruses$keep_score_vb_dvf_vs2_vs_tv <- getting_viral_set_1(viruses, include_deepvirfinder = T,
include_vibrant = T,
include_virsorter2 = T,
include_tuning_viral = T,
include_tuning_not_viral = F,
include_virsorter = T)
viruses$keep_score_vb_dvf_vs2_vs_tv_tnv <- getting_viral_set_1(viruses, include_deepvirfinder = T,
include_vibrant = T,
include_virsorter2 = T,
include_tuning_viral = T,
include_tuning_not_viral = T,
include_virsorter = T)
viruses$true_virus <- "not"
viruses$true_virus[viruses$seqtype=="virus"] <- "virus"
viruses_long_scores <- viruses %>%
select(contains("keep_score_vb"), size_class, true_virus) %>%
pivot_longer(cols=contains("keep_score_"),
names_to="rule_combination",
values_to="viral_score") %>%
mutate(viral_score=as.factor(round(viral_score))) %>%
group_by(rule_combination, viral_score, size_class, true_virus) %>%
summarise(n = n())
`summarise()` has grouped output by 'rule_combination', 'viral_score', 'size_class'. You can override using the `.groups` argument.
viruses_long_scores$size_class <- factor(viruses_long_scores$size_class,
levels = c("3-5kb", "5-10kb", ">10kb"))
viruses_long_scores_addition <- viruses_long_scores[(viruses_long_scores$true_virus=="virus" & (viruses_long_scores$rule_combination!="keep_score_vb_dvf_vs2_vs_tv_tnv") & viruses_long_scores$viral_score!="0"),]
There were 23 warnings (use warnings() to see them)
ggplot(viruses_long_scores_addition, aes(y=n, x=rule_combination,
fill=viral_score)) +
geom_bar(stat="identity") +
theme_light() +
coord_flip() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
legend.position = "bottom"
) +
scale_fill_brewer(palette = "Purples") +
xlab("") +
ylab("Number of Sequences") +
scale_x_discrete(labels=c("VB", "VB+DVF", "VB+DVF+VS2", "VB+DVF+VS2+VS",
"VB+DVF+VS2+VS+addition")) +
facet_grid(~true_virus, scales = "free")
ggplot(viruses_long_scores, aes(y=n, x=rule_combination,
There were 50 or more warnings (use warnings() to see the first 50)
fill=viral_score)) +
geom_bar(stat="identity") +
theme_light() +
coord_flip() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
legend.position = "bottom"
) +
scale_fill_brewer(palette = "PuOr", ) +
xlab("") +
ylab("Number of Sequences") +
scale_x_discrete(labels=c("VB", "VB+DVF", "VB+DVF+VS2", "VB+DVF+VS2+VS",
"VB+DVF+VS2+VS+addition", "VB+DVF+VS2+VS+addition-removal")) +
facet_grid(~true_virus, scales = "free")
ggplot(viruses_long_scores, aes(y=n, x=rule_combination,
There were 20 warnings (use warnings() to see them)
fill=viral_score)) +
geom_bar(stat="identity") +
theme_light() +
coord_flip() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
legend.position = "bottom"
) +
scale_fill_brewer(palette = "PuOr", ) +
xlab("") +
ylab("Number of Sequences") +
facet_grid(size_class~true_virus, scales = "free")
viruses_high <- viruses[viruses$keep_score_vb_dvf_vs2_vs_tv>=1,]
There were 50 or more warnings (use warnings() to see the first 50)
viruses_high_mod <- viruses_high %>% select(keep_score_vb,keep_score_vb_dvf,
keep_score_vb_dvf_vs2, keep_score_vb_dvf_vs2_vs,
keep_score_vb_dvf_vs2_vs_tv, keep_score_vb_dvf_vs2_vs_tv_tnv)
#viruses_high_mod <- apply(viruses_high_mod, c(1,2), function(x) {if (x >= 1) {x <- 1} else {x <- 0}})
viruses_high_mod <- as_tibble(viruses_high_mod)
sm_m <- reshape2::melt(viruses_high_mod)
No id variables; using all as measure variables
colnames(sm_m) <- c("method", "viral_score")
sm_m <- sm_m[sm_m$viral_score>0,]
sm_m$score <- sm_m$viral_score
sm_m$score[sm_m$viral_score==0.5] <- "0.5"
sm_m$score[sm_m$viral_score>=1] <- "1"
sm_m$score[sm_m$viral_score>=2] <- "2"
sm_m$score[sm_m$viral_score>=3] <- "3"
sm_m$score[sm_m$viral_score>=4] <- "4"
sm_m$score[sm_m$viral_score>=5] <- "5"
sm_m$score <- factor(sm_m$score,
levels=c("0.5", "1", "2","3","4","5"))
ggplot(sm_m, aes(x=method, y=score,
fill=score)) +
geom_bar(stat="identity") +
theme_light() +
coord_flip() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
legend.position = "bottom"
) +
scale_fill_manual(name = 'Viral Score',
values = alpha(c(viridis(6)), 1)) +
xlab("") +
ylab("Number of Sequences") +
coord_flip()
Coordinate system already present. Adding new coordinate system, which will replace the existing one.
viruses_mcc_alluvial <- data.frame(seqtype=viruses$seqtype,
keep_score_high_MCC=viruses$keep_score_high_MCC,
confusion_matrix_high_MCC=viruses$confusion_matrix_high_MCC)
viruses_mcc_alluvial$keep_score_vb <- getting_viral_set_1(viruses, include_deepvirfinder = F,
include_vibrant = T,
include_virsorter2 = F,
include_tuning_viral = F,
include_tuning_not_viral = F,
include_virsorter = F)
viruses_mcc_alluvial$keep_score_dvf <- getting_viral_set_1(viruses, include_deepvirfinder = F,
include_vibrant = T,
include_virsorter2 = F,
include_tuning_viral = F,
include_tuning_not_viral = F,
include_virsorter = F)
viruses_mcc_alluvial$keep_score_vs <- getting_viral_set_1(viruses, include_deepvirfinder = F,
include_vibrant = F,
include_virsorter2 = F,
include_tuning_viral = F,
include_tuning_not_viral = F,
include_virsorter = T)
viruses_mcc_alluvial$keep_score_vs2 <- getting_viral_set_1(viruses, include_deepvirfinder = F,
include_vibrant = F,
include_virsorter2 = T,
include_tuning_viral = F,
include_tuning_not_viral = F,
include_virsorter = F)
viruses_mcc_alluvial$keep_score_tv <- getting_viral_set_1(viruses, include_deepvirfinder = F,
include_vibrant = F,
include_virsorter2 = F,
include_tuning_viral = T,
include_tuning_not_viral = F,
include_virsorter = F)
viruses_mcc_alluvial$keep_score_tnv <- getting_viral_set_1(viruses, include_deepvirfinder = F,
include_vibrant = T,
include_virsorter2 = F,
include_tuning_viral = F,
include_tuning_not_viral = T,
include_virsorter = F)
viruses_mcc_alluvial %>%
count(seqtype, keep_score_high_MCC) %>%
spread(key = keep_score_high_MCC, value=n)
viruses_mcc_alluvial <- viruses_mcc_alluvial %>%
count(seqtype, keep_score_dvf, keep_score_vb, keep_score_vs,
keep_score_vs2, keep_score_tv, keep_score_tnv, keep_score_high_MCC) %>%
mutate(high_mcc_viral_score=factor(round(keep_score_high_MCC)))
ggplot(viruses_mcc_alluvial,
aes(axis1 = keep_score_dvf, axis2 = keep_score_vb,
axis3 = keep_score_vs, axis4 = keep_score_vs2,
axis5 = keep_score_tv, axis6 = keep_score_tnv,
y=n)) +
geom_alluvium(aes(fill=high_mcc_viral_score),
width = 0, knot.pos = 0, reverse = FALSE) +
geom_stratum(width = 1/5) +
theme_bw() +
geom_text(stat = "stratum", aes(label = after_stat(stratum)),
reverse = FALSE) +
theme(
axis.text.x=element_text(size=14, angle = 90)
) +
scale_x_continuous(breaks=c(1,2,3,4,5,6),
labels=c("dvf", "kj", "vs", "vs2",
"tv", "tnv")) +
scale_fill_brewer(palette = "PuOr", ) +
facet_wrap(~seqtype, scales="free_y")
viruses$keep_score_visualize <- viruses$keep_score_high_MCC
viruses$keep_score_visualize[viruses$keep_score_high_MCC>1] <- "> 1"
viruses$keep_score_visualize[viruses$keep_score_high_MCC==1] <- "1"
viruses$keep_score_visualize[viruses$keep_score_high_MCC==0.5] <- "0.5"
viruses$keep_score_visualize[viruses$keep_score_high_MCC==0] <- "0"
viruses$keep_score_visualize[viruses$keep_score_high_MCC==-0.5] <- "-0.5"
viruses$keep_score_visualize[viruses$keep_score_high_MCC==-1] <- "-1"
viruses$keep_score_visualize[viruses$keep_score_high_MCC<=-1] <- "< -1"
viruses$keep_score_visualize <- factor(viruses$keep_score_visualize,
levels=c("< -1", "-1", "-0.5", "0", "0.5","1", "> 1"))
#viruses$keep_score_visualize <- factor(viruses$keep_score_visualize,
# labels=c("≤ 0", "≤ 0", "≤ 0", "0.5","1", "> 1"))
levels(factor(viruses$keep_score_visualize))
ggplot(viruses, aes(x=as.factor(Index),
fill=keep_score_visualize, color=keep_score_visualize)) +
geom_bar(stat="count", position="stack") +
theme_light() +
coord_flip() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16)
) +
scale_color_manual(name = 'Viral Score',
values = alpha(c(viridis(6)), 1)) +
scale_fill_manual(name = 'Viral Score',
values = alpha(c(viridis(6)), 0.5)) +
xlab("Index") +
ylab("Sequence Count") +
facet_wrap(~confusion_matrix_high_MCC, scales = "free")
viral_scores <- matrix(data=0, nrow=nrow(viruses), ncol=nrow(combos_list))
num_viruses <- data.frame(toolcombo=rep(0, nrow(combos_list)),
num_viruses=rep(0, nrow(combos_list)))
for (i in 1:nrow(combos_list)) {
viral_scores[,i] <- getting_viral_set_1(viruses, include_vibrant = combos_list$VIBRANT[i],
include_virsorter = combos_list$VS[i],
include_virsorter2 = combos_list$VS2[i],
include_tuning_viral = combos_list$tune_viral[i],
include_tuning_not_viral = combos_list$tune_not_viral[i],
include_deepvirfinder = combos_list$DVF[i])
if (max(viral_scores[,i])<=0) {
num_viruses$num_viruses[i] <- 0
}
else {
num_viruses$num_viruses[i] <- table(viral_scores[,i]>=1)[[2]]
}
num_viruses$toolcombo[i] <- combos_list$toolcombo[i]
num_viruses$toolcombo2[i] <- combos_list$toolcombo2[i]
}
num_viruses$numrules <- str_count(num_viruses$toolcombo, "1")
num_viruses <- num_viruses[order(num_viruses$num_viruses, decreasing=F),]
num_viruses$toolcombo <- factor(num_viruses$toolcombo, levels = unique(num_viruses$toolcombo))
num_viruses$toolcombo2 <- factor(num_viruses$toolcombo2, levels = unique(num_viruses$toolcombo2))
num_viruses$numrules <- as.factor(num_viruses$numrules)
viral_scores_nozeros <- viral_scores[rowSums(viral_scores)>0,]
viral_scores_nozeros <- viral_scores_nozeros + 1
viral_scores_nozeros <- as.data.frame(viral_scores_nozeros)
colnames(viral_scores_nozeros) <- num_viruses$toolcombo
library(phyloseq)
tooldata <- num_viruses
rownames(tooldata) <- tooldata$toolcombo
physeq_pooled <- phyloseq(otu_table(viral_scores_nozeros, taxa_are_rows = T),
sample_data(tooldata))
ordination <- phyloseq::ordinate(physeq =physeq_pooled, method = "PCoA", distance = "bray")
phyloseq::plot_ordination(physeq = physeq_pooled, ordination = ordination,
shape="numrules", color="num_viruses") +
geom_point(size = 3) +
theme_bw() +
geom_label(label=tooldata$toolcombo)
phyloseq::plot_ordination(physeq = physeq_pooled, ordination = ordination,
shape="numrules", color="num_viruses") +
geom_point(size = 3) +
theme_bw()
bray_dist <- phyloseq::distance(physeq_pooled, method="bray")
clusters <- hclust(dist(bray_dist))
plot(clusters)
myclusters <- cutree(clusters, h=0.8)
#names(myclusters[myclusters==1])
#names(myclusters[myclusters==2])
#names(myclusters[myclusters==3])
#names(myclusters[myclusters==4])
#names(myclusters[myclusters==5])
myclusters_df <- tibble(combo=names(myclusters),
cluster_index=myclusters)
myclusters_df <- separate(myclusters_df, col=combo, into=c("tnv", "DVF",
"tv", "VB",
"VS", "VS2"),
sep=" ", remove = F)
tool_count <- as.data.frame(rbind(table(myclusters_df$tnv, myclusters_df$cluster_index)[2,],
table(myclusters_df$DVF, myclusters_df$cluster_index)[2,],
table(myclusters_df$tv, myclusters_df$cluster_index)[2,],
table(myclusters_df$VB, myclusters_df$cluster_index)[2,],
table(myclusters_df$VS, myclusters_df$cluster_index)[2,],
table(myclusters_df$VS2, myclusters_df$cluster_index)[2,])
)
tool_count <- data.frame(t(apply(tool_count, c(1), function(x) {x <- x/table(myclusters_df$cluster_index)})))
tool_count$method <- c("tnv", "DVF", "tv", "VB", "VS", "VS2")
tool_count <- melt(tool_count)
colnames(tool_count) <- c("tool", "cluster_index", "tool_count_norm")
pal <- ggthemes::tableau_color_pal(palette="Tableau 10", type="regular")
ggplot(tool_count, aes(x=tool, y=tool_count_norm,
fill=tool,
color=tool)) +
geom_bar(stat="identity") +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "none",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
#legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
scale_fill_manual(name="",
values = alpha(rev(pal(6)), 0.5)) +
scale_color_manual(name="",
values = alpha(rev(pal(6)), 1)) +
xlab("Tool") +
ylab("Proportion of Times in Cluster") +
facet_wrap(~cluster_index, nrow=1)
accuracy_scores_melt <- accuracy_scores %>%
select(precision, recall, MCC, numrules, toolcombo) %>%
group_by(numrules, toolcombo) %>%
summarise(precision=mean(precision),
recall=mean(recall),
MCC=mean(MCC)) %>%
pivot_longer(cols=c(precision, recall, MCC),
names_to="performance_metric",
values_to="performance_metric_score")
myclusters_df <- inner_join(accuracy_scores_melt, myclusters_df,
by=c("toolcombo"="combo"))
myclusters_df$cluster_index <- as.factor(myclusters_df$cluster_index)
ggplot(myclusters_df, aes(x=cluster_index, y=performance_metric_score,
color=cluster_index, fill=cluster_index)) +
geom_boxplot() +
geom_point(alpha=0.5) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
ylab("MCC") +
xlab("Cluster") +
scale_fill_manual(name="",
values = alpha(rev(pal(9)), 0.5)) +
scale_color_manual(name="",
values = alpha(rev(pal(9)), 1)) +
facet_wrap(~performance_metric)
viruses$keep_score_all <- getting_viral_set_1(viruses, include_deepvirfinder = T,
include_vibrant = T,
include_virsorter2 = T,
include_tuning_viral = T,
include_tuning_not_viral = T,
include_virsorter = T)
viruses$confusion_matrix_all <- "true negative"
viruses$confusion_matrix_all[viruses$seqtype=="virus" & viruses$keep_score_all<1] <- "false negative"
viruses$confusion_matrix_all[viruses$seqtype=="virus" & viruses$keep_score_all>=1] <- "true positive"
viruses$confusion_matrix_all[viruses$seqtype!="virus" & viruses$keep_score_all>=1] <- "false positive"
TP <- table(viruses$confusion_matrix_all)[4]
FP <- table(viruses$confusion_matrix_all)[2]
TN <- table(viruses$confusion_matrix_all)[3]
FN <- table(viruses$confusion_matrix_all)[1]
precision <- TP/(TP+FP)
precision
recall <- TP/(TP+FN)
recall
F1 <- 2*precision*recall/(precision+recall)
F1
MCC <- (TP*TN-FP*FN)/sqrt(as.numeric(TP+FP)*as.numeric(TP+FN)*as.numeric(TN+FP)*as.numeric(TN+FN))
MCC
precision=62%, recall=92%, MCC=73%
visualizing confusion matrix by taxa
confusion_by_taxa <- melt(table(viruses$confusion_matrix_all, viruses$seqtype, viruses$Index))
colnames(confusion_by_taxa) <- c("confusion_matrix", "seqtype","Index", "count")
table(viruses$confusion_matrix_all)
length(grep("true", viruses$confusion_matrix_all))/nrow(viruses)
length(grep("true positive", viruses$confusion_matrix_all))/length(grep("virus", viruses$seqtype))
pal <- ggthemes::tableau_color_pal(palette="Tableau 10", type="regular")
ggplot(confusion_by_taxa, aes(x=count, y=as.factor(Index),
fill=confusion_matrix,
color=confusion_matrix)) +
geom_bar(stat="identity") +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
scale_fill_manual(name="",
values = alpha(rev(pal(4)), 0.5),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
scale_color_manual(name="",
values = alpha(rev(pal(4)), 1),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
xlab("Number of Sequences") +
ylab("") +
facet_wrap(~seqtype, scales = "free") +
coord_flip()
viruses$keep_score_high_recall <- getting_viral_set_1(viruses, include_deepvirfinder = T,
include_vibrant = T,
include_virsorter2 = T,
include_tuning_viral = T,
include_tuning_not_viral = F,
include_virsorter = T)
viruses$confusion_matrix_high_recall <- "true negative"
viruses$confusion_matrix_high_recall[viruses$seqtype=="virus" & viruses$keep_score_high_recall<1] <- "false negative"
viruses$confusion_matrix_high_recall[viruses$seqtype=="virus" & viruses$keep_score_high_recall>=1] <- "true positive"
viruses$confusion_matrix_high_recall[viruses$seqtype!="virus" & viruses$keep_score_high_recall>=1] <- "false positive"
visualizing confusion matrix by taxa
confusion_by_taxa <- melt(table(viruses$confusion_matrix_high_recall, viruses$seqtype, viruses$Index))
colnames(confusion_by_taxa) <- c("confusion_matrix", "seqtype","Index", "count")
pal <- ggthemes::tableau_color_pal(palette="Tableau 10", type="regular")
p2 <- ggplot(confusion_by_taxa, aes(x=count, y=as.factor(Index),
fill=confusion_matrix,
color=confusion_matrix)) +
geom_bar(stat="identity") +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
scale_fill_manual(name="",
values = alpha(rev(pal(4)), 0.5),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
scale_color_manual(name="",
values = alpha(rev(pal(4)), 1),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
xlab("Number of Sequences") +
ylab("") +
facet_wrap(~seqtype, scales = "free") +
coord_flip()
p2
accuracy:
length(grep("true", viruses$confusion_matrix_high_recall))/nrow(viruses)
0.887
recall
length(grep("true positive", viruses$confusion_matrix_high_recall))/length(grep("virus", viruses$seqtype))
recover almost all of the viruses this way, but more protist contamination
0.960
confusion_by_taxa <- viruses %>% count(confusion_matrix_high_recall, seqtype, size_class)
colnames(confusion_by_taxa) <- c("confusion_matrix", "seqtype","size", "count")
viruses$keep_score_few_tools <- getting_viral_set_1(viruses, include_deepvirfinder = F,
There were 50 or more warnings (use warnings() to see the first 50)
include_vibrant = F,
include_virsorter2 = T,
include_tuning_viral = T,
include_tuning_not_viral = T,
include_virsorter = F)
viruses$confusion_matrix_few_tools <- "true negative"
viruses$confusion_matrix_few_tools[viruses$seqtype=="virus" & viruses$keep_score_few_tools<1] <- "false negative"
viruses$confusion_matrix_few_tools[viruses$seqtype=="virus" & viruses$keep_score_few_tools>=1] <- "true positive"
viruses$confusion_matrix_few_tools[viruses$seqtype!="virus" & viruses$keep_score_few_tools>=1] <- "false positive"
TP <- table(viruses$confusion_matrix_few_tools)[4]
FP <- table(viruses$confusion_matrix_few_tools)[2]
TN <- table(viruses$confusion_matrix_few_tools)[3]
FN <- table(viruses$confusion_matrix_few_tools)[1]
precision <- TP/(TP+FP)
precision
recall <- TP/(TP+FN)
recall
F1 <- 2*precision*recall/(precision+recall)
F1
MCC <- (TP*TN-FP*FN)/sqrt(as.numeric(TP+FP)*as.numeric(TP+FN)*as.numeric(TN+FP)*as.numeric(TN+FN))
MCC
precision=77%, recall=76%, MCC=74%
confusion_by_taxa_method <- viruses %>%
select(contains("confusion_matrix"), seqtype, Index) %>%
pivot_longer(cols=contains("confusion_matrix"),
names_to="confusion_matrix_type",
values_to="confusion_matrix_value") %>%
count(seqtype, Index, confusion_matrix_type, confusion_matrix_value)
pal <- ggthemes::tableau_color_pal(palette="Tableau 10", type="regular")
ggplot(confusion_by_taxa_method, aes(y=n, x=confusion_matrix_type,
fill=confusion_matrix_value,
color=confusion_matrix_value)) +
geom_boxplot() +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
scale_fill_manual(name="",
values = alpha(rev(pal(4)), 0.5),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
scale_color_manual(name="",
values = alpha(rev(pal(4)), 1),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
xlab("Number of Sequences") +
ylab("") +
facet_wrap(~seqtype, scales = "free")
confusion_by_taxa_method <- viruses %>%
select(contains("confusion_matrix"), seqtype, Index) %>%
pivot_longer(cols=contains("confusion_matrix"),
names_to="confusion_matrix_type",
values_to="confusion_matrix_value") %>%
count(seqtype, Index, confusion_matrix_type, confusion_matrix_value) %>%
filter(grepl("true", confusion_matrix_value)) %>%
mutate(confusion_matrix_type=sub("confusion_matrix_", "", confusion_matrix_type))
ggplot(confusion_by_taxa_method, aes(y=n, x=confusion_matrix_type,
fill=confusion_matrix_value,
color=confusion_matrix_value)) +
geom_boxplot() +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=10, angle=90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
scale_fill_manual(name="",
values = alpha(rev(pal(4))[3:4], 0.5),
labels=c(
"true negative", "true positive")) +
scale_color_manual(name="",
values = alpha(rev(pal(4))[3:4], 1),
labels=c(
"true negative", "true positive")) +
xlab("Tool Set") +
ylab("Contig Count") +
facet_wrap(~seqtype, scales = "free")
viruses$true_virus <- "not"
There were 40 warnings (use warnings() to see them)
viruses$true_virus[viruses$seqtype=="virus"] <- "virus"
viruses_long_scores <- viruses %>%
select(contains("keep_score_high"), contains("keep_score_all"), size_class, true_virus) %>%
pivot_longer(cols=contains("keep_score_"),
names_to="rule_combination",
values_to="viral_score") %>%
mutate(viral_score=as.factor(round(viral_score))) %>%
group_by(rule_combination, viral_score, size_class, true_virus) %>%
summarise(n = n())
`summarise()` has grouped output by 'rule_combination', 'viral_score', 'size_class'. You can override using the `.groups` argument.
viruses_long_scores$size_class <- factor(viruses_long_scores$size_class,
levels = c("3-5kb", "5-10kb", ">10kb"))
ggplot(viruses_long_scores, aes(y=n, x=rule_combination,
There were 20 warnings (use warnings() to see them)
fill=viral_score)) +
geom_bar(stat="identity") +
theme_light() +
coord_flip() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
legend.position = "bottom"
) +
scale_fill_brewer(palette = "PuOr", ) +
xlab("") +
ylab("Number of Sequences") +
facet_grid(~true_virus, scales = "free")
ggplot(viruses_long_scores, aes(y=n, x=rule_combination,
There were 20 warnings (use warnings() to see them)
fill=viral_score)) +
geom_bar(stat="identity") +
theme_light() +
coord_flip() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
legend.position = "bottom"
) +
scale_fill_brewer(palette = "PuOr", ) +
xlab("") +
ylab("Number of Sequences") +
facet_grid(size_class~true_virus, scales = "free")
Extra Stuff #####################################################################
ggplot(viruses, aes(x=checkv_length, y=keep_score_high_MCC,
fill=confusion_matrix_high_MCC,
color=confusion_matrix_high_MCC)) +
geom_point(stat="identity", shape=21) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
scale_fill_manual(name="",
values = alpha(rev(pal(4)), 0.5),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
scale_color_manual(name="",
values = alpha(rev(pal(4)), 1),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
xlab("Sequence Length (bp)") +
ylab("Pipeline Viral Score") +
facet_wrap(~seqtype) +
scale_x_log10()
ggplot(viruses, aes(x=checkv_completeness, y=hallmark,
fill=confusion_matrix_high_recall,
color=confusion_matrix_high_recall)) +
geom_point(stat="identity", shape=21) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
scale_fill_manual(name="",
values = alpha(rev(pal(4)), 0.5),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
scale_color_manual(name="",
values = alpha(rev(pal(4)), 1),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
xlab("CheckV Completeness") +
ylab("Number of Hallmark Genes") +
facet_wrap(~seqtype) +
scale_x_log10()
ggplot(viruses, aes(x=checkv_completeness, y=keep_score_high_MCC,
fill=confusion_matrix_high_recall,
color=confusion_matrix_high_recall)) +
geom_point(stat="identity", shape=21) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
scale_fill_manual(name="",
values = alpha(rev(pal(4)), 0.5),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
scale_color_manual(name="",
values = alpha(rev(pal(4)), 1),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
xlab("CheckV Completeness") +
ylab("Pipeline Viral Score") +
facet_wrap(~seqtype) +
scale_x_log10()
ggplot(viruses, aes(x=confusion_matrix_high_recall, y=checkv_length,
fill=confusion_matrix_high_recall,
color=confusion_matrix_high_recall)) +
geom_boxplot() +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
scale_fill_manual(name="",
values = alpha(rev(pal(4)), 0.5),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
scale_color_manual(name="",
values = alpha(rev(pal(4)), 1),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
xlab("Sequence Length (bp)") +
ylab("Pipeline Viral Score") +
scale_y_log10()
looking at false negatives
viruses_false_negs <- viruses[(viruses$seqtype=="virus" & viruses$keep_score_high_recall<1),]
looking at protists calling viral
viruses_false_pos_protists <- viruses[(viruses$seqtype=="protist" & viruses$keep_score_high_recall>=1),]
viruses$keep_score_vb <- getting_viral_set_1(viruses, include_deepvirfinder = F,
include_vibrant = T,
include_virsorter2 = F,
include_tuning_viral = F,
include_tuning_not_viral = F,
include_virsorter = F)
viruses$keep_score_vb_tv <- getting_viral_set_1(viruses, include_deepvirfinder = F,
include_vibrant = T,
include_virsorter2 = T,
include_tuning_viral = T,
include_tuning_not_viral = F,
include_virsorter = F)
viruses_high <- viruses[viruses$keep_score_vb_tv>=1,] #uncomment this line if want to use all 6 tools
viruses_high_mod <- viruses_high %>% select(keep_score_vb,
keep_score_vb_tv)
#viruses_high_mod <- apply(viruses_high_mod, c(1,2), function(x) {if (x >= 1) {x <- 1} else {x <- 0}})
viruses_high_mod <- as_tibble(viruses_high_mod)
sm_m <- reshape2::melt(viruses_high_mod)
colnames(sm_m) <- c("method", "score")
ggplot(sm_m, aes(x=method, y=score,
fill=as.factor(score))) +
geom_bar(stat="identity") +
theme_light() +
coord_flip() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom"
) +
scale_fill_manual(name = 'Number of Methods',
values = alpha(c(viridis(14)), 1)) +
xlab("Primary Method") +
ylab("Count of Viral Contigs") +
coord_flip()
library(pROC)
viruses$truepositive <- rep(0, nrow(viruses))
viruses$truepositive[viruses$seqtype=="virus"] <- 1
rocobj <- roc(viruses$truepositive, viruses$keep_score)
rocobj_all <- roc(viruses$truepositive, viruses$keep_score_all)
auc <- round(auc(viruses$truepositive, viruses$keep_score),4)
auc_all <- round(auc(viruses$truepositive, viruses$keep_score_all),4)
#create ROC plot
ggroc(rocobj, colour = 'steelblue', size = 2) +
ggtitle(paste0('ROC Curve ', '(AUC = ', auc, ')')) +
coord_equal()
ggroc(rocobj_all, colour = 'green', size = 2) +
ggtitle(paste0('ROC Curve ', '(AUC = ', auc_all, ')'))
Sensitivity: The probability that the model predicts a positive outcome for an observation when indeed the outcome is positive. Specificity: The probability that the model predicts a negative outcome for an observation when indeed the outcome is negative.
viral_scores <- matrix(data=0, nrow=nrow(viruses), ncol=nrow(combos_list))
num_viruses <- data.frame(toolcombo=rep(0, nrow(combos_list)),
num_viruses=rep(0, nrow(combos_list)))
for (i in 1:nrow(combos_list)) {
viral_scores[,i] <- getting_viral_set_1(viruses, include_vibrant = combos_list$VIBRANT[i],
include_virsorter = combos_list$VS[i],
include_virsorter2 = combos_list$VS2[i],
include_tuning = combos_list$CheckV[i],
include_kaiju = combos_list$Kaiju[i],
include_deepvirfinder = combos_list$DVF[i])
num_viruses$num_viruses[i] <- table(viral_scores[,i]>=1)[[2]]
num_viruses$toolcombo[i] <- combos_list$toolcombo[i]
num_viruses$toolcombo2[i] <- combos_list$toolcombo2[i]
}
num_viruses$numrules <- str_count(num_viruses$toolcombo, "1")
num_viruses <- num_viruses[order(num_viruses$num_viruses, decreasing=F),]
num_viruses$toolcombo <- factor(num_viruses$toolcombo, levels = unique(num_viruses$toolcombo))
num_viruses$toolcombo2 <- factor(num_viruses$toolcombo2, levels = unique(num_viruses$toolcombo2))
num_viruses$numrules <- as.factor(num_viruses$numrules)
ggplot(num_viruses, aes(x=toolcombo, y=num_viruses,
color=numrules, fill=numrules)) +
geom_point() +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Tool Combination (CV, DVF, KJ, VB, VS, VS2)") +
ylab("Num Viruses Predicted")
ggplot(num_viruses, aes(x=toolcombo2, y=num_viruses,
color=numrules, fill=numrules)) +
geom_point() +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Tool Combination (CV, DVF, KJ, VB, VS, VS2)") +
ylab("Num Viruses Predicted")
ggplot(num_viruses, aes(x=numrules, y=num_viruses)) +
geom_boxplot(aes(color=numrules)) +
geom_point(aes(color=numrules, fill=numrules)) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14, angle = 90),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Number of Tools") +
ylab("Num Viruses Predicted")
viral_scores_nozeros <- viral_scores[rowSums(viral_scores)>0,]
viral_scores_nozeros <- viral_scores_nozeros + 1
viral_scores_nozeros <- as.data.frame(viral_scores_nozeros)
colnames(viral_scores_nozeros) <- num_viruses$toolcombo2
library(phyloseq)
tooldata <- num_viruses
rownames(tooldata) <- tooldata$toolcombo2
physeq_pooled <- phyloseq(otu_table(viral_scores_nozeros, taxa_are_rows = T),
sample_data(tooldata))
ordination <- phyloseq::ordinate(physeq =physeq_pooled, method = "PCoA", distance = "bray")
phyloseq::plot_ordination(physeq = physeq_pooled, ordination = ordination,
shape="numrules", color="num_viruses") +
geom_point(size = 3) +
theme_bw() +
geom_label(label=tooldata$toolcombo)
phyloseq::plot_ordination(physeq = physeq_pooled, ordination = ordination,
shape="numrules", color="num_viruses") +
geom_point(size = 3) +
theme_bw()
to do: try coloring above based on the F1 scores of the testing set on each combination
bray_dist <- phyloseq::distance(physeq_pooled, method="bray")
clusters <- hclust(dist(bray_dist))
plot(clusters)
myclusters <- cutree(clusters, h=1.1)
names(myclusters[myclusters==1])
names(myclusters[myclusters==2])
names(myclusters[myclusters==3])
names(myclusters[myclusters==4])
names(myclusters[myclusters==5])
myclusters_df <- tibble(combo=names(myclusters),
cluster_index=myclusters)
myclusters_df <- separate(myclusters_df, col=combo, into=c("CheckV", "DVF",
"Kaiju", "VIBRANT",
"VirSorter", "VirSorter2"),
sep=" ", remove = F)
tool_count <- as.data.frame(rbind(table(myclusters_df$CheckV, myclusters_df$cluster_index)[2,],
table(myclusters_df$DVF, myclusters_df$cluster_index)[2,],
table(myclusters_df$Kaiju, myclusters_df$cluster_index)[2,],
table(myclusters_df$VIBRANT, myclusters_df$cluster_index)[2,],
table(myclusters_df$VirSorter, myclusters_df$cluster_index)[2,],
table(myclusters_df$VirSorter2, myclusters_df$cluster_index)[2,])
)
tool_count$method <- c("CheckV", "DVF", "Kaiju", "VIBRANT", "VirSorter", "VirSorter2")
tool_count <- melt(tool_count)
colnames(tool_count) <- c("tool", "cluster_index", "tool_count")
pal <- ggthemes::tableau_color_pal(palette="Tableau 10", type="regular")
ggplot(tool_count, aes(x=cluster_index, y=tool_count,
fill=cluster_index,
color=cluster_index)) +
geom_bar(stat="identity") +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
scale_fill_manual(name="",
values = alpha(rev(pal(6)), 0.5)) +
scale_color_manual(name="",
values = alpha(rev(pal(6)), 1)) +
xlab("Cluster") +
ylab("Number of Times in Cluster") +
facet_wrap(~tool, scales = "free")
ggplot(viruses, aes(x=checkv_viral_genes, y=confusion_matrix_high_precision,
fill=confusion_matrix_high_precision,
color=confusion_matrix_high_precision)) +
geom_boxplot(alpha=0.3) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
scale_fill_manual(name="",
values = alpha(rev(pal(4)), 0.5),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
scale_color_manual(name="",
values = alpha(rev(pal(4)), 1),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
xlab("Number of Viral Sequences") +
ylab("") +
facet_wrap(~seqtype, scales = "free") +
coord_flip()
ggplot(viruses, aes(x=percent_viral, y=confusion_matrix_high_precision,
fill=confusion_matrix_high_precision,
color=confusion_matrix_high_precision)) +
geom_boxplot(alpha=0.3) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
scale_fill_manual(name="",
values = alpha(rev(pal(4)), 0.5),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
scale_color_manual(name="",
values = alpha(rev(pal(4)), 1),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
xlab("Percent Genes Viral") +
ylab("") +
facet_wrap(~seqtype, scales = "free") +
coord_flip()
ggplot(viruses, aes(x=hallmark, y=confusion_matrix_high_precision,
fill=confusion_matrix_high_precision,
color=confusion_matrix_high_precision)) +
geom_boxplot(alpha=0.3) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
scale_fill_manual(name="",
values = alpha(rev(pal(4)), 0.5),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
scale_color_manual(name="",
values = alpha(rev(pal(4)), 1),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
xlab("Number of Hallmark Genes") +
ylab("") +
facet_wrap(~seqtype, scales = "free") +
coord_flip()
ggplot(viruses, aes(x=hallmark, y=checkv_viral_genes,
fill=confusion_matrix_high_precision,
color=confusion_matrix_high_precision)) +
geom_point(alpha=0.3) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
scale_fill_manual(name="",
values = alpha(rev(pal(4)), 0.5),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
scale_color_manual(name="",
values = alpha(rev(pal(4)), 1),
labels=c("false negative", "false positive",
"true negative", "true positive")) +
xlab("Number of Hallmark Genes") +
ylab("Number of Viral Genes") +
facet_wrap(~seqtype, scales = "free") +
coord_flip()
viruses_false_positive <- viruses[viruses$confusion_matrix_high_precision=="false positive",]
viruses_false_negative <- viruses[viruses$confusion_matrix_high_precision=="false negative",]
ggplot(viruses, aes(x=hallmark, y=checkv_viral_genes,
fill=checkv_length,
color=checkv_length,
shape=checkv_provirus)) +
geom_point(alpha=0.3) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Number of Hallmark Genes") +
ylab("Number of Viral Genes") +
facet_wrap(~seqtype, scales = "free") +
coord_flip()
ggplot(viruses_false_positive, aes(x=hallmark, y=checkv_length,
fill=checkv_viral_genes,
color=checkv_viral_genes,
shape=checkv_provirus)) +
geom_point(alpha=0.3) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Number of Hallmark Genes") +
ylab("Contig Length") +
facet_wrap(~seqtype, scales = "free") +
coord_flip()
ggplot(viruses_false_positive[viruses_false_positive$seqtype=="bacteria"], aes(x=hallmark, y=checkv_length,
fill=checkv_viral_genes,
color=checkv_viral_genes,
shape=checkv_provirus)) +
geom_point(alpha=0.3) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Number of Hallmark Genes") +
ylab("Contig Length") +
facet_wrap(~Kaiju_Viral, scales = "free") +
coord_flip()
ggplot(viruses_false_positive[viruses_false_positive$seqtype=="fungi"], aes(x=hallmark, y=checkv_length,
fill=keep_score_high_precision,
color=keep_score_high_precision,
shape=checkv_provirus)) +
geom_point(alpha=0.3) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Number of Hallmark Genes") +
ylab("Contig Length") +
facet_wrap(~Kaiju_Viral, scales = "free") +
coord_flip()
ggplot(viruses_false_positive[viruses_false_positive$seqtype=="protist"], aes(x=hallmark, y=checkv_length,
fill=checkv_viral_genes,
color=checkv_viral_genes,
shape=checkv_provirus)) +
geom_point(alpha=0.3) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Number of Hallmark Genes") +
ylab("Contig Length") +
facet_wrap(~Kaiju_Viral, scales = "free") +
coord_flip()
ggplot(viruses_false_negative, aes(x=hallmark, y=checkv_length,
fill=checkv_viral_genes,
color=checkv_viral_genes,
shape=checkv_provirus)) +
geom_point(alpha=0.3) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Number of Hallmark Genes") +
ylab("Contig Length") +
facet_wrap(~Kaiju_Viral, scales = "free") +
coord_flip()
ggplot(viruses_false_negative, aes(x=hallmark, y=checkv_length,
fill=keep_score_high_precision,
color=keep_score_high_precision,
shape=checkv_provirus)) +
geom_point(alpha=0.3) +
theme_light() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16),
) +
xlab("Number of Hallmark Genes") +
ylab("Contig Length") +
facet_wrap(~Kaiju_Viral, scales = "free") +
coord_flip()
table(viruses$hallmark[viruses$confusion_matrix_high_precision=="false positive"]>0)
table(viruses$percent_host[viruses$confusion_matrix_high_precision=="false positive"]<50)
confusion_by_keep_score <- viruses %>%
select(contains("keep_score"), seqtype, Index) %>%
pivot_longer(cols=contains("keep_score"),
names_to="pipeline_type",
values_to="pipeline_value") %>%
mutate(pipeline_type=sub("keep_score_", "", pipeline_type)) %>%
count(seqtype, Index, pipeline_type, pipeline_value)
confusion_by_keep_score$confusion_matrix <- "true negative"
confusion_by_keep_score$confusion_matrix[confusion_by_keep_score$seqtype=="virus" & confusion_by_keep_score$pipeline_value<1] <- "false negative"
confusion_by_keep_score$confusion_matrix[confusion_by_keep_score$seqtype=="virus" & confusion_by_keep_score$pipeline_value>=1] <- "true positive"
confusion_by_keep_score$confusion_matrix[confusion_by_keep_score$seqtype!="virus" & confusion_by_keep_score$pipeline_value>=1] <- "false positive"
confusion_by_keep_score$keep_score_visualize <- confusion_by_keep_score$pipeline_value
confusion_by_keep_score$keep_score_visualize[confusion_by_keep_score$pipeline_value>1] <- "> 1"
confusion_by_keep_score$keep_score_visualize[confusion_by_keep_score$pipeline_value==1] <- "1"
confusion_by_keep_score$keep_score_visualize[confusion_by_keep_score$pipeline_value==0.5] <- "0.5"
confusion_by_keep_score$keep_score_visualize[confusion_by_keep_score$pipeline_value==0] <- "0"
confusion_by_keep_score$keep_score_visualize[confusion_by_keep_score$pipeline_value==-0.5] <- "-0.5"
confusion_by_keep_score$keep_score_visualize[confusion_by_keep_score$pipeline_value==-1] <- "-1"
confusion_by_keep_score$keep_score_visualize[confusion_by_keep_score$pipeline_value<=-1] <- "< -1"
confusion_by_keep_score$keep_score_visualize <- factor(confusion_by_keep_score$keep_score_visualize,
levels=c("< -1", "-1", "-0.5", "0", "0.5","1", "> 1"))
ggplot(confusion_by_keep_score, aes(x=confusion_matrix, y=n,
fill=keep_score_visualize, color=keep_score_visualize)) +
geom_boxplot() +
theme_light() +
coord_flip() +
theme(
panel.grid.major.y = element_blank(),
panel.border = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
axis.text.y=element_text(size=14),
axis.text.x=element_text(size=14),
legend.text=element_text(size=12),
axis.title=element_text(size=16)
) +
scale_color_manual(name = 'Viral Score',
values = alpha(c(viridis(6)), 1)) +
scale_fill_manual(name = 'Viral Score',
values = alpha(c(viridis(6)), 0.5)) +
xlab("Index") +
ylab("Sequence Count") +
facet_wrap(~pipeline_type, scales = "free")